home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
AmigActive 10
/
AACD 10.iso
/
AACD
/
Online
/
SpeakFreely
/
src
/
lpc10
/
voice_nn.c
< prev
next >
Wrap
C/C++ Source or Header
|
2000-05-18
|
10KB
|
260 lines
/* SpeedEdit 184,196,24,0,0,10,16,10 Updated 08/10/92 09:10:50 */
#include <stdio.h>
#include <math.h>
#include "filter.h"
#define NFEATURE 3 /* number of features per voicing frame */
#define NICLASS 3*NFEATURE /* number of inputs to the neural net classifier */
#define LPF_LENG 3 /* length of low pass filter used to estimate average v & uv energy */
#define MAXDB 72.25 /* scale factor to normalize speech level */
/* 20.0*log10(4096), where 4096 = max signal amplitude */
#define V_NOMINAL 0.6 /* nominal value of Elow_v_hat */
#define UV_NOMINAL 0.3 /* nominal value of Elow_uv_hat */
#define NFRAMES_ELOW 440 /* number of (half) frames of Elow history to save */
#define BUF_FRACT .1 /* fraction of Elow_buf used to determine El and Eh */
#define THRESH .2 /* threshhold for Eh - El to declare speech present in Elow_buf */
#define NHIDDEN 2 /* number of hidden units used in the neural net classifier */
/* ---------------------------------------- CLASSIFY ----------------------------------------
*/
static float classify(x)
float x[];
{
int i;
float out[NHIDDEN+1];
static float bias[] = { 4.7605e-01, -4.4143e-01, -2.5532e+01 };
static float weight0[] = { 0.0000e+00, 9.9600e+00, 1.3399e+00 }; /* weight0[0] is not used */
static float weight1[] = { 2.5327e-01, 5.1214e-02, 5.1845e-02,
8.7389e-01, 6.6049e-02, 5.6143e-02,
2.1496e-02, 6.4269e-02, 7.5030e-03 };
static float weight2[] = { 9.1907e-01, 1.4349e+00, -5.3130e-03,
-2.1947e+00, 2.5551e+01, 4.0398e-01,
-1.9285e-01, -9.4621e-01, 7.7245e-01 };
/* hidden unit 1 */
out[1] = bias[1];
for (i=0; i<NICLASS; i++) out[1] += weight1[i] * x[i];
out[1] = tanh((double) out[1]);
/* hidden unit 2 */
out[2] = bias[2];
for (i=0; i<NICLASS; i++) out[2] += weight2[i] * x[i];
out[2] = tanh((double) out[2]);
/* output unit */
out[0] = bias[0];
for (i=1; i<=NHIDDEN; i++) out[0] += weight0[i] * out[i];
return tanh((double) out[0]);
}
/* ---------------------------------------- AUTOCOR ----------------------------------------
*/
static float autocor(signal, n, lag)
float signal[];
int n, lag;
{
int i, p1, p2;
float cor1, cor2, cor3;
/*p1 = lag / 2; /* to be centered in signal window, go back half of lag */
p1 = lag >> 1; /* to be centered in signal window, go back half of lag */
p2 = lag - p1; /* and forward the difference */
cor1 = cor2 = cor3 = 0.0;
for (i=0; i<n; i++)
{
cor1 += signal[i-p1] * signal[i+p2];
cor2 += signal[i-p1] * signal[i-p1];
cor3 += signal[i+p2] * signal[i+p2];
}
if (cor2 == 0.0 || cor3 == 0.0) return 0.0;
else return (cor1 * cor1) / (cor2 * cor3); /* range: 0 to +1 */
}
/* ---------------------------------------- SINSERT ----------------------------------------
This routine re-sorts a buffer which has a single (new) entry (perhaps) out of place.
'sbuf' is the array sorted in ascending order; 'obuf' is another array to be re-ordered in the same manner;
'k' is the index of the newly inserted entry.
*/
static void sinsert(sbuf, obuf, n, k)
float sbuf[]; /* sbuf[0:n-1] */
int obuf[], n, k; /* obuf[0:n-1] */
{
int i, j, tempi;
float temp;
if (k < n-1 && sbuf[k] > sbuf[k+1]) /* is new datum inserted too low in buffer? */
{
for (i=k+2; i<n; i++) if (sbuf[k] <= sbuf[i]) break; /* i points to value to follow sbuf[k] */
temp = sbuf[k];
tempi = obuf[k];
for (j=k; j<i-1; j++)
{
sbuf[j] = sbuf[j+1];
obuf[j] = obuf[j+1];
}
sbuf[i-1] = temp;
obuf[i-1] = tempi;
}
else if (k > 0 && sbuf[k] < sbuf[k-1]) /* is new datum inserted too high in buffer? */
{
for (i=k-2; i>=0; i--) if (sbuf[k] >= sbuf[i]) break; /* i points to value to precede sbuf[k] */
temp = sbuf[k];
tempi = obuf[k];
for (j=k; j>i+1; j--)
{
sbuf[j] = sbuf[j-1];
obuf[j] = obuf[j-1];
}
sbuf[i+1] = temp;
obuf[i+1] = tempi;
}
}
/* ---------------------------------------- VFEATURES ----------------------------------------
*/
static void vfeatures(speech, lpspeech, length, pitch, features)
float speech[], lpspeech[], features[]; /* all indices start at 0 */
int length, pitch;
{
int i, nframes;
float Elow, Elown, Flow, Cpch;
float ss_low, ss_full, temp, El, Eh, Ethresh;
static int first = 1; /* flag to indicate first execution of this routine */
static int indx = 0;
static float Elow_uv_hat; /* estimate of average Elow when speech is absent */
static float Elow_v_hat; /* estimate of average Elow when speech is present */
static float avg_num[] = { 1.60283e-4, 3.20566e-4, 1.60283e-4 };
static float avg_den[] = { 1.0, -1.949359, 0.95 };
static float Elow_buf[NFRAMES_ELOW]; /* the past NFRAMES_ELOW of Elow values */
static int Elow_order[NFRAMES_ELOW];
static FILTER *uvnf, *vnf;
static float uvnf_state[] = { 468.053, 468.053 };
static float vnf_state[] = { 936.107, 936.107 };
if (first) /* initialize on first pass */
{
first = 0;
/* create the two filters used to estimate Elow when speech absent and present */
uvnf = filter_create(avg_num, LPF_LENG, avg_den, LPF_LENG);
vnf = filter_create(avg_num, LPF_LENG, avg_den, LPF_LENG);
/* get the averaging filter outputs to nominal values */
filter_state_set(uvnf, 2, uvnf_state);
filter_state_set( vnf, 2, vnf_state);
Elow_v_hat = filter(vnf, V_NOMINAL);
Elow_uv_hat = filter(uvnf, UV_NOMINAL);
/* initialize Elow_buf to nominal values */
for (i=0; i<NFRAMES_ELOW; i++)
{
if (i < .5 * NFRAMES_ELOW) Elow_buf[i] = UV_NOMINAL;
else Elow_buf[i] = V_NOMINAL;
Elow_order[i] = i;
}
}
/* Elow: low band energy */
ss_low = 0.0;
for (i=0; i<length; i++) ss_low += lpspeech[i] * lpspeech[i];
Elow = ss_low / length;
if (Elow > 0.0) Elow = 10.0 * log10((double) Elow) - MAXDB; /* convert to dB; fully loaded = 0 dB */
else Elow = -100.0; /* -100 dB if zero input */
if (Elow < -100.0) Elow = -100.0; /* clamp negative end of range */
/*Elow = (Elow / 50.0) + 1.0; /* -100 dB to 0 dB is scaled -1.0 to +1.0 */
Elow = (Elow * 20e-3) + 1.0; /* -100 dB to 0 dB is scaled -1.0 to +1.0 */
/* Determine the average speaking level */
for (i=0; i<NFRAMES_ELOW; i++) if (Elow_order[i] == indx) break; /* i = index of oldest Elow_buf sample */
Elow_buf[i] = Elow; /* replace oldest sample with newest */
sinsert(Elow_buf, Elow_order, NFRAMES_ELOW, i); /* re-sort buffers after insertion at index i */
/** indx = ++indx % NFRAMES_ELOW; /* increment indx modulo NFRAMES_ELOW */
indx++;
if(indx == NFRAMES_ELOW) indx = 0;
El = Eh = 0.0;
nframes = BUF_FRACT * NFRAMES_ELOW;
for (i=0; i<nframes; i++)
{
El += Elow_buf[i]; /* sum the lowest BUF_FRACT of Elow_buf */
Eh += Elow_buf[NFRAMES_ELOW-1 - i]; /* sum the highest BUF_FRACT of Elow_buf */
}
/*El /= nframes;*/
El *= 0.02272727272;
/*Eh /= nframes;*/
Eh *= 0.02272727272;
if (Eh - El > THRESH) Ethresh = .5 * (El + Eh); /* if it's likely that Elow_buf has speech */
else Ethresh = .5 * (Elow_v_hat + Elow_uv_hat);
if (Elow > Ethresh) Elow_v_hat = filter( vnf, Elow); /* speech present this half-frame */
else Elow_uv_hat = filter(uvnf, Elow); /* speech absent this half-frame */
/* Elown: Elow normalized */
Elown = Elow - Elow_v_hat + V_NOMINAL;
/* Flow: fraction of full band energy in low band */
ss_full = 0.0;
for (i=0; i<length; i++) ss_full += speech[i] * speech[i];
if (ss_full > 0.0) Flow = ss_low / ss_full;
else Flow = 0.0;
if (Flow > 1.0) Flow = 1.0; /* this happens quite often! */
Flow = 2.0 * Flow - 1.0; /* range: -1 to +1 */
/* Cpch: autocorrelation of LPF(speech) @ one pitch period */
Cpch = -2.0;
for (i=-1; i<=1; i++)
{
temp = 2.0 * autocor(lpspeech, length, pitch+i) - 1.0; /* range: -1 to +1 */
if (temp > Cpch) Cpch = temp;
}
features[0] = Elown;
features[1] = Flow;
features[2] = Cpch;
}
/* ---------------------------------------- VOICING ----------------------------------------
Determines two half-frame voicing decisions for the frame speech[start:end].
*/
void voicing(speech, lpspeech, start, end, pitch, v1, v2)
float speech[], lpspeech[];
int start, end, pitch, *v1, *v2;
{
int i, vfleng;
float vclass;
static float fbuf[4*NFEATURE] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
/* shift the feature buffer by two voicing frames */
for (i=0; i<2*NFEATURE; i++) fbuf[i] = fbuf[i + 2*NFEATURE];
/*vfleng = (end - start + 1) / 2; /* voicing frame length */
vfleng = (end - start + 1) >> 1; /* voicing frame length */
vfeatures(speech+start+vfleng, lpspeech+start+vfleng, vfleng, pitch, fbuf+6);
vfeatures(speech+start+2*vfleng, lpspeech+start+2*vfleng, vfleng, pitch, fbuf+9);
/* make the voicing classification for the first half frame */
*v1 = *v2 = 0;
vclass = classify(fbuf);
if (vclass >= 0.0) *v1 = 1;
/* make the voicing classification for the second half frame */
vclass = classify(fbuf+NFEATURE);
if (vclass >= 0.0) *v2 = 1;
}